Abstract
Script to generate figures and differential analysis results for Cui et al.library(Seurat)
library(ggplot2)
library(pheatmap)
library(gridExtra)
library(dplyr)
library(enrichR)
library(DT)
library(knitr)
# Color scheme to be used for this manuscript
main_color <- c("skyblue2", "magenta3", "tomato")
main_pal <- colorRampPalette(c("navy", "red4", "yellow"))(300)
# Create directory to hold results
if (!exists("results")) {dir.create("results")}
if (!exists("RDS")) {dir.create("RDS")}object <- CreateSeuratObject(counts=sample.counts, min.cells=3, min.features = 500, project = "JCV_Day14")#Integration of citeseq data
dir_antibody <- paste0("./data/umi_count/")
tissue.ab <- Read10X(data.dir = dir_antibody, gene.column = 1)
antibody.cells <- colnames(tissue.ab)[colnames(tissue.ab) %in% row.names(object@meta.data)]
tissue.ab <- tissue.ab[, antibody.cells]
# Keep cells that have both RNA and antibody counts
object <- subset(x = object, cells = antibody.cells)
object[["CITE"]] <- CreateAssayObject(counts = tissue.ab)Sample quality before filtering
object$percent.mt <- PercentageFeatureSet(object, pattern="^MT-")
VlnPlot(object, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol=3, pt.size = 0.1)object <- subset(object, subset=nFeature_RNA > 500 & nFeature_RNA < 10000 &
percent.mt < 20 &
nCount_RNA < 100000 & nCount_RNA > 2000)
VlnPlot(object, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol=3, pt.size = 0.1)# Call cell batch by assigning the identity with the highest UMI count
b <- object[["CITE"]]@counts
project <- unlist(lapply(strsplit(row.names(b), "\\-"), function(x) x[1]))
set <- unlist(lapply(strsplit(row.names(b), "\\-"), function(x) x[2]))
set <- paste0("Set_", set)
# Remove cells whose max counts is < 500
filter1 <- apply(as.matrix(b), 2, max)
filter1 <- filter1[which(filter1 > 500)]
b <- b[, colnames(b) %in% names(filter1)]
# Assign cells by max number of umi hash
b <- apply(as.matrix(b), 2, which.max)
object$project <- as.factor(plyr::mapvalues(b, c(1:10), project))
# Assign set of biological replicates to cells
object$set <- as.factor(plyr::mapvalues(b, c(1:10), set))
# Get day14 cells
object <- SetIdent(object, value = "project")
object <- subset(object, idents = "d14")# Function to exclude multiplets
check_multiplet <- function(x) {
# x: each cells taken into consideration
mat <- object[["CITE"]]
mat <- mat[grepl("sg", row.names(mat)), x]
# Max of this should be fewer than 50. If not, it is a doublet and should be removed.
multiplet <- ifelse(max(mat) < 50, "Day14", "multiplet")
return(multiplet)
}
v <- vector()
for (i in row.names(object@meta.data)) {
v.temp <- check_multiplet(i)
v <- c(v, v.temp)
}
table(v)## v
## Day14 multiplet
## 1618 437
object.list <- SplitObject(object, split.by = "set")
object.list <- object.list[c("Set_1", "Set_2", "Set_3")]
object.list <- lapply(object.list, function(x) {
x <- NormalizeData(x, verbose = FALSE)
x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000, verbose = FALSE)
})
object.anchors <- FindIntegrationAnchors(object.list, dims = 1:20, verbose = FALSE)
integrate <- IntegrateData(anchorset = object.anchors, dims = 1:20, verbose = FALSE)# Read in cc markers
s.genes = cc.genes$s.genes
s.genes[s.genes == "MLF1IP"] <- "CENPU"
g2m.genes = cc.genes$g2m.genes
g2m.genes[g2m.genes == "FAM64A"] <- "PIMREG"
g2m.genes[g2m.genes == "HN1"] <- "JPT1"
integrate <- CellCycleScoring(integrate, s.features = s.genes, g2m.features = g2m.genes, set.ident = FALSE)DefaultAssay(integrate) <- "integrated"
integrate<- ScaleData(integrate, vars.to.regress = c("percent.mt", "nCount_RNA", "nFeature_RNA"), verbose = FALSE)
integrate<- RunPCA(integrate, npcs = 90, ndims.print = 1:5)
integrate<- JackStraw(integrate, dims=40)
integrate<- ScoreJackStraw(integrate, dims = 1:40)
JackStrawPlot(integrate, dims = 1:40)DefaultAssay(integrate) <- "RNA"
integrate <- SetIdent(integrate, value = "integrated_snn_res.0.2")
cluster.markers <- FindAllMarkers(integrate, only.pos = TRUE,preturn.thresh = 1, logfc.threshold = 0)
write.table(cluster.markers, "./results/Markers_byCellType.txt", sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
### Canonical markers for astrocytes:
cluster.markers[which(cluster.markers$gene %in% c("AQP4", "GFAP", "S100B")),]## p_val avg_logFC pct.1 pct.2 p_val_adj cluster gene
## GFAP 1.600014e-53 0.6937675 0.998 0.998 5.023245e-49 0 GFAP
## AQP4 7.656249e-14 0.4568122 0.718 0.617 2.403680e-09 0 AQP4
## S100B 1.786958e-11 0.4826169 0.915 0.884 5.610153e-07 0 S100B
### Canonical markers for GPCs:
cluster.markers[which(cluster.markers$gene %in% c("SOX2", "PAX6", "PDGFRA")),]## p_val avg_logFC pct.1 pct.2 p_val_adj cluster gene
## PAX6 2.676895e-04 0.04515934 0.682 0.568 1.000000e+00 1 PAX6
## SOX2 8.776102e-14 0.26867166 0.994 0.979 2.755257e-09 2 SOX2
## PDGFRA 8.684591e-08 0.23417609 0.611 0.451 2.726527e-03 2 PDGFRA
## PAX61 3.776147e-07 0.14919078 0.725 0.571 1.185521e-02 2 PAX6
Cell cycle scoring
Based on these cluster markers, cell types are then assigned.
integrate$celltype <- plyr::mapvalues(integrate$integrated_snn_res.0.2, 0:2, c("Astrocytes", "GPCs - S Phase", "GPCs - G2M Phase"))
integrate$celltype <- factor(integrate$celltype, levels = c("GPCs - S Phase", "GPCs - G2M Phase", "Astrocytes"))Marker expression in assigned cell clusters
integrate <- SetIdent(integrate, value = "celltype")
VlnPlot(integrate, features = c("SOX2", "PAX6", "PDGFRA"), ncol = 3, pt.size = 0) # Total viral RNA for each cell
Jvgp <- integrate[["RNA"]]@counts[grep("Jvgp", row.names(integrate[["RNA"]]@counts)),]
Jvgp <- apply(Jvgp, 2, sum)
ggplot(as.data.frame(Jvgp), aes(x=Jvgp)) + geom_histogram(bins = 50) + xlim(0, 50)# Assign "uninfected" identity to cells that have a total sum of all detected viral RNAs < 10
Jvgp <- ifelse(Jvgp < 10, "Uninfected", "Infected")
integrate$Infection <- Jvgp
# Split subpopulation architecture view by infection
p1 <- DimPlot(integrate, group.by = "Phase") + scale_color_manual(values = main_color)
p2 <- DimPlot(integrate, group.by = "Phase", split.by = "Infection") + scale_color_manual(values = main_color)
cowplot::plot_grid(p1, p2, ncol = 2, rel_widths = c(0.6, 1))infection.df <- integrate@meta.data[, c("Phase", "Infection")]
ggplot(infection.df, aes(x = Phase)) + geom_bar(aes(fill=Infection), position = "fill") + ylab("Proportion") + scale_fill_manual(values = c("firebrick4", "grey")) + theme(axis.title.x = element_blank(), legend.position = "top")Is the proportion of infected cells per cell cycle phase significant?
zProp_test <- function(Phase1, Phase2) {
# function to carry out two proportion z test: Whether the proportion of infected cells in Phase 1 is higher than the proportion of infected cells in Phase 2
# function returns p value of the z test
# Phase1: string - cell cycle to compare to
# Phase2: string - cell cycle to be compared to
phase1 <- integrate$Infection[which(integrate$Phase == Phase1)]
phase2 <- integrate$Infection[which(integrate$Phase == Phase2)]
infected_phase1 <- sum(phase1 == "Infected")
infected_phase2 <- sum(phase2 == "Infected")
res <- prop.test(x = c(infected_phase1, infected_phase2), n = c(length(phase1), length(phase2)), alternative = "greater")
return(res$p.value)
}
zProp_test("S", "G2M")## [1] 1.356101e-10
## [1] 2.720143e-33
## [1] 0.002321455
p.adjust(c(zProp_test("S", "G2M"), zProp_test("S", "G1"), zProp_test("G2M", "G1")), method = "bonferroni")## [1] 4.068302e-10 8.160429e-33 6.964364e-03
# DE test bewteen Infected and Uninfected within group
integrate <- SetIdent(integrate, value = "Phase")
iuInPhase <- data.frame(p_val=numeric(), avg_logFC=numeric(), pct.1=numeric(), pct.2=numeric(), p_val_adj=numeric(), cluster = character(), gene=character())
for (i in unique(integrate$Phase)) {
tmp <- FindMarkers(integrate, ident.1 = "Infected", group.by = "Infection", subset.ident = i)
tmp$cluster <- rep(i, nrow(tmp))
tmp$gene <- row.names(tmp)
tmp <- tmp[which(tmp$p_val_adj < 0.05),]
iuInPhase <- rbind(iuInPhase, tmp)
}
write.table(iuInPhase, "./results/Markers_InfectedvsUninfected_byCellPhase.txt", sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
iuInPhase[grep("Jvgp", iuInPhase$gene),]## p_val avg_logFC pct.1 pct.2 p_val_adj cluster gene
## Jvgp6 3.175923e-27 1.408206 0.774 0.112 9.970811e-23 G2M Jvgp6
## Jvgp5 2.066132e-25 2.495508 0.957 0.643 6.486620e-21 G2M Jvgp5
## Jvgp4 1.000642e-09 2.278481 1.000 0.965 3.141515e-05 G2M Jvgp4
## Jvgp1 3.152784e-07 1.794574 0.591 0.252 9.898165e-03 G2M Jvgp1
## Jvgp51 9.416325e-117 1.477193 0.965 0.474 2.956255e-112 G1 Jvgp5
## Jvgp61 2.787702e-114 0.691525 0.817 0.149 8.751992e-110 G1 Jvgp6
## Jvgp52 5.995882e-45 2.121556 0.995 0.554 1.882407e-40 S Jvgp5
## Jvgp62 6.093251e-40 1.191401 0.932 0.214 1.912976e-35 S Jvgp6
Jvgp expression by cell cycle phases
p3 <- VlnPlot(integrate, features = c("Jvgp1", "Jvgp4", "Jvgp5", "Jvgp6"), pt.size = 0, split.by = "Infection", combine = FALSE)
p3 <- lapply(p3, function(x) {x <- x + scale_fill_manual(values = c("firebrick4", "grey")) + theme(legend.position = "none", axis.title = element_blank()); return(x)})
cowplot::plot_grid(plotlist = p3, ncol = 4)# Write out data frame for WGCNA
dat <- integrate[["RNA"]]@data
saveRDS(dat, "./RDS/WGCNA_data.RDS")Turquoise module eigengene and Jvgp expression superimposed on cellular architecture
integrate$Turquoise <- MEG$MEturquoise
p4 <- FeaturePlot(integrate, "Turquoise", cols = c("snow", "turquoise4"), pt.size = 0.8) + ggtitle("Turquoise - Module Eigengene") + theme(title = element_text(size = 9))
p5 <- FeaturePlot(integrate, c("Jvgp1", "Jvgp4", "Jvgp5", "Jvgp6"), combine = FALSE)
p5 <- lapply(p5, function(x) {x <- x + scale_color_gradient2() +
theme_void() +
theme(legend.position = "none", plot.title = element_text(hjust=0.5)); return(x)})
legend <- cowplot::get_legend(FeaturePlot(integrate, "Jvgp4") + scale_color_gradient2(breaks = c(0,6), labels = c("low","high")))
cowplot::plot_grid(p4,
cowplot::plot_grid(plotlist = p5, ncol =2),
cowplot::plot_grid(NULL, legend, rel_widths = c(0.2,1)),
ncol = 3, rel_widths = c(2,1.4,1))Correlation between turquoise module eigengene and mean Jvgp expression
ggplot(integrate@meta.data, aes(x=Turquoise, y = mean_Jvgp)) + geom_point(aes(color = Infection), size = 0.8) +
scale_color_manual(values = c("firebrick4", "grey")) +
geom_smooth(data = integrate@meta.data, aes(x=Turquoise, y = mean_Jvgp), method = "lm", se = FALSE, color = "turquoise4") +
xlab("Turquoise - Module Eigengene") + ylab("log10 - Average Jvgp Expression") + ylim(-1.5,1) + theme_classic() +
theme(legend.position = "top") + ggtitle("cor = 0.21")Within module turquoise, what genes are dysregulated by cell cycle phase or upon viral infection"
## [1] "RNA"
# Bewteen G2M and S Phase
integrate <- SetIdent(integrate, value = "Phase")
g2ms <- FindMarkers(integrate, ident.1 = "S", ident.2 = "G2M")
write.table(g2ms, "./results/DEG_S_vs_G2M.txt", sep = "\t", row.names = TRUE, col.names = NA, quote = FALSE)
# Between Infected and Non-Infected
integrate <- SetIdent(integrate, value = "Infection")
iu <- FindMarkers(integrate, ident.1 = "Infected", ident.2 = "Uninfected")
write.table(iu, "./results/DEG_Infected_vs_Uninfected.txt", sep = "\t", row.names = TRUE, col.names = NA, quote = FALSE)Gene ontology analysis for turquoise modular genes
# Extract the signif pathways
dynamicColor <- readRDS("./RDS/ModuleMembership.RDS")
turquoise <- names(dynamicColor)[which(dynamicColor == "turquoise")]
dbs <- "GO_Biological_Process_2018"
# enrichR has since updated their website. Point to the new enrichR website
options(enrichR.base.address="https://amp.pharm.mssm.edu/Enrichr/")
eTurquoise <- enrichr(genes = turquoise, databases = dbs)## Uploading data to Enrichr... Done.
## Querying GO_Biological_Process_2018... Done.
## Parsing results... Done.
eTurquoise <- eTurquoise[[1]]
eTurquoise <- eTurquoise[order(eTurquoise$Adjusted.P.value),]
eTurquoise <- eTurquoise[which(eTurquoise$Adjusted.P.value < 0.001),]
write.table(eTurquoise, "./results/GO_turquoise_BP2018.txt", row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t")Create node and edge data frames for Gephi and Cytoscape
options(stringsAsFactors = FALSE)
pway.df <- eTurquoise[,c(1,9)]
df <- data.frame(Source = character(), Target = character())
for (i in 1:nrow(pway.df)) {
goi <- pway.df$Genes[i]
goi <- strsplit(goi, ";")[[1]]
tmp.df <- data.frame(Source = goi, Target = rep(pway.df$Term[i], length(goi)))
df <- rbind(df, tmp.df)
}Dysregulated genes within module turquoise
# Genes that are deemed intersting could be within the following comparison:
## Circle: Infected vs Uninfected
circle <- row.names(iu)[which(iu$p_val_adj < 0.001)]
## Square: Phase markers
square <- row.names(g2ms)[which(g2ms$p_val_adj < 0.001)]
genes_of_interest <- c(circle, square)
df <- df[df$Source %in% genes_of_interest, ]
## Remove terms that have fewer than 10 genes
keep <- table(df$Target)[which(table(df$Target) > 10)]
keep <- names(keep)
df <- df[df$Target %in% keep,]
# Remove GO term ID
df$Target <- gsub("\\ \\(GO\\:.*", "", df$Target)
df$Target <- gsub("\\,", "_", df$Target)
write.csv(df,"./results/scWGCNA_forGephi_edgeList.csv", row.names = FALSE, quote = FALSE)
# df is the edge file
# df2 is the node file
df2 <- data.frame(id = c(df$Source, df$Target))
df2$label <- c(rep("Gene", nrow(df)), rep("Pathway", nrow(df)))
row.names(df2) <- NULL
df2 <- df2[!duplicated(df2), ]
df2$Infection <- ifelse(df2$id %in% circle, "Yes", "No")
df2$Phase <- rep(0, nrow(df2))
g2ms$Phase <- ifelse(g2ms$avg_logFC >=0, "S", "G2M")
for (i in 1:nrow(df2)) {
if (df2$label[i] != "Pathway" & df2$id[i] %in% row.names(g2ms)) {
df2$Phase[i] <- g2ms$Phase[which(row.names(g2ms) == df2$id[i])]
} else {
df2$Phase[i] <- NA
}
}
write.csv(df2,"./results/scWGCNA_forGephi_nodeList.csv", row.names = FALSE, quote = FALSE)mod <- read.csv("./data/scWGCNA_forCytoscape_nodeList_postGephi.csv", stringsAsFactors = F)
mod <- mod[which(mod$Label == "Gene"), ]
# Add in Jvgp genes
tmp <- data.frame(Id = c("Jvgp1", "Jvgp2", "Jvgp4", "Jvgp5", "Jvgp6"),
Label = rep("Gene", 5),
timeset = rep(NA, 5),
infection = rep("Yes", 5),
Phase = c(NA, "G2M", NA, "S", "S"),
modularity_class = rep(4, 5),
indegree = rep(NA, 5),
outdegree = rep(NA, 5),
Degree = rep(NA,5))
mod <- rbind(mod, tmp)
# Create annotation for gene names
annotRow <- data.frame(row.names = mod$Id)
annotRow$Phase <- mod$Phase
###########################################
###########################################
# cluster average
DefaultAssay(integrate)## [1] "RNA"
integrate <- SetIdent(integrate, value = "Phase")
ca <- AverageExpression(integrate, assays = "RNA")
ca <- ca$RNA
###########################################
###########################################
# Heatmap color
annot_colors <- list(Phase=c("magenta3", "tomato"))
names(annot_colors[[1]]) <- c("G2M", "S")p.list <- list()
for (i in 0:4) {
goi <- mod$Id[mod$modularity_class == i]
datatoplot3 <- log2(ca[goi,] + 1)
tmp.plot <- pheatmap(as.matrix(datatoplot3), annotation_row = annotRow[goi,,drop=FALSE], annotation_colors = annot_colors,cluster_rows = T, cluster_cols = F, color = main_pal, border_color = NA, scale = "row", cellwidth = 10, cellheight = 8, silent = TRUE, clustering_method = "average")
p.list[[i+1]] <- tmp.plot[[4]]
}
grid.arrange(arrangeGrob(grobs = p.list, ncol =5))## R version 3.5.1 (2018-07-02)
## Platform: x86_64-conda_cos6-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux
##
## Matrix products: default
## BLAS/LAPACK: /gpfs/fs2/scratch/sgoldman_lab/.conda/envs/scRNA_v3.1/lib/R/lib/libRblas.so
##
## locale:
## [1] LC_CTYPE=en_US.utf-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.utf-8 LC_COLLATE=en_US.utf-8
## [5] LC_MONETARY=en_US.utf-8 LC_MESSAGES=en_US.utf-8
## [7] LC_PAPER=en_US.utf-8 LC_NAME=en_US.utf-8
## [9] LC_ADDRESS=en_US.utf-8 LC_TELEPHONE=en_US.utf-8
## [11] LC_MEASUREMENT=en_US.utf-8 LC_IDENTIFICATION=en_US.utf-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] knitr_1.30 DT_0.11 enrichR_2.1 dplyr_1.0.2
## [5] gridExtra_2.3 pheatmap_1.0.12 ggplot2_3.3.2 Seurat_3.1.1.9002
##
## loaded via a namespace (and not attached):
## [1] tsne_0.1-3 nlme_3.1-142 bitops_1.0-6
## [4] RcppAnnoy_0.0.14 RColorBrewer_1.1-2 httr_1.4.2
## [7] sctransform_0.2.0 tools_3.5.1 R6_2.4.1
## [10] irlba_2.3.3 KernSmooth_2.23-16 mgcv_1.8-31
## [13] uwot_0.1.4 lazyeval_0.2.2 colorspace_1.4-1
## [16] withr_2.3.0 npsurv_0.4-0 tidyselect_1.1.0
## [19] curl_4.3 compiler_3.5.1 plotly_4.9.1
## [22] labeling_0.3 caTools_1.17.1.3 scales_1.1.1
## [25] lmtest_0.9-37 ggridges_0.5.1 pbapply_1.4-3
## [28] stringr_1.4.0 digest_0.6.26 rmarkdown_1.18
## [31] R.utils_2.9.1 pkgconfig_2.0.3 htmltools_0.5.0
## [34] bibtex_0.4.2.3 htmlwidgets_1.5.2 rlang_0.4.8
## [37] farver_2.0.3 generics_0.0.2 zoo_1.8-6
## [40] jsonlite_1.7.1 ica_1.0-2 gtools_3.8.1
## [43] R.oo_1.23.0 magrittr_1.5 Matrix_1.2-18
## [46] Rcpp_1.0.5 munsell_0.5.0 ape_5.4-1
## [49] reticulate_1.13 lifecycle_0.2.0 R.methodsS3_1.7.1
## [52] stringi_1.5.3 yaml_2.2.1 gbRd_0.4-11
## [55] MASS_7.3-51.4 gplots_3.0.1.1 Rtsne_0.15
## [58] plyr_1.8.6 grid_3.5.1 parallel_3.5.1
## [61] gdata_2.18.0 listenv_0.8.0 ggrepel_0.8.1
## [64] crayon_1.3.4 lattice_0.20-38 cowplot_1.0.0
## [67] splines_3.5.1 SDMTools_1.1-221.1 pillar_1.4.6
## [70] igraph_1.2.6 rjson_0.2.20 future.apply_1.3.0
## [73] reshape2_1.4.4 codetools_0.2-16 leiden_0.3.1
## [76] glue_1.4.2 evaluate_0.14 lsei_1.2-0
## [79] metap_1.1 RcppParallel_4.4.4 data.table_1.12.6
## [82] vctrs_0.3.4 png_0.1-7 Rdpack_0.11-0
## [85] gtable_0.3.0 RANN_2.6.1 purrr_0.3.4
## [88] tidyr_1.0.0 future_1.15.1 xfun_0.18
## [91] rsvd_1.0.2 survival_3.1-7 viridisLite_0.3.0
## [94] tibble_3.0.4 cluster_2.1.0 globals_0.12.4
## [97] fitdistrplus_1.0-14 ellipsis_0.3.1 ROCR_1.0-7